{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Email_Spam_Detection.ipynb",
      "version": "0.3.2",
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8EgXQGnGdIeT",
        "colab_type": "text"
      },
      "source": [
        "![spam-940521_1280.jpg](https://storage.needpix.com/rsynced_images/spam-940521_1280.jpg)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Cat4ph94Jume",
        "colab_type": "code",
        "outputId": "0feff088-30df-4759-d014-5d975f439d6b",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 55
        }
      },
      "source": [
        "'''\n",
        "Description: This program detects if an email is spam (1) or not (0)\n",
        "Resources: (1) https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html\n",
        "           (2) https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html\n",
        "           (3) https://www.kaggle.com/astandrik/simple-spam-filter-using-naive-bayes\n",
        "           (4) https://www.kaggle.com/dilip990/spam-ham-detection-using-naive-bayes-classifier\n",
        "           (5) https://www.geeksforgeeks.org/bag-of-words-bow-model-in-nlp/\n",
        "           (6) https://towardsdatascience.com/spam-detection-with-logistic-regression-23e3709e522\n",
        "           (7) https://github.com/SharmaNatasha/Machine-Learning-using-Python/blob/master/Classification%20project/Spam_Detection.ipynb\n",
        "Data Source: https://www.kaggle.com/balakishan77/spam-or-ham-email-classification/data\n",
        "'''\n"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'\\nDescription: This program detects if an email is spam (1) or not (0)\\nResources: (1) https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html\\n           (2) https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html\\n           (3) https://www.kaggle.com/astandrik/simple-spam-filter-using-naive-bayes\\n           (4) https://www.kaggle.com/dilip990/spam-ham-detection-using-naive-bayes-classifier\\n           (5) https://www.geeksforgeeks.org/bag-of-words-bow-model-in-nlp/\\nData Source: https://www.kaggle.com/balakishan77/spam-or-ham-email-classification/data\\n'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 48
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VPT6pdRXhucy",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "#Import libraries\n",
        "import numpy as np \n",
        "import pandas as pd \n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "import string\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CLaWTgouh4P_",
        "colab_type": "code",
        "outputId": "77909562-17c3-41e5-bee0-6d5e1b3b2e63",
        "colab": {
          "resources": {
            "http://localhost:8080/nbextensions/google.colab/files.js": {
              "data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7Ci8vIE1heCBhbW91bnQgb2YgdGltZSB0byBibG9jayB3YWl0aW5nIGZvciB0aGUgdXNlci4KY29uc3QgRklMRV9DSEFOR0VfVElNRU9VVF9NUyA9IDMwICogMTAwMDsKCmZ1bmN0aW9uIF91cGxvYWRGaWxlcyhpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IHN0ZXBzID0gdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKTsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIC8vIENhY2hlIHN0ZXBzIG9uIHRoZSBvdXRwdXRFbGVtZW50IHRvIG1ha2UgaXQgYXZhaWxhYmxlIGZvciB0aGUgbmV4dCBjYWxsCiAgLy8gdG8gdXBsb2FkRmlsZXNDb250aW51ZSBmcm9tIFB5dGhvbi4KICBvdXRwdXRFbGVtZW50LnN0ZXBzID0gc3RlcHM7CgogIHJldHVybiBfdXBsb2FkRmlsZXNDb250aW51ZShvdXRwdXRJZCk7Cn0KCi8vIFRoaXMgaXMgcm91Z2hseSBhbiBhc3luYyBnZW5lcmF0b3IgKG5vdCBzdXBwb3J0ZWQgaW4gdGhlIGJyb3dzZXIgeWV0KSwKLy8gd2hlcmUgdGhlcmUgYXJlIG11bHRpcGxlIGFzeW5jaHJvbm91cyBzdGVwcyBhbmQgdGhlIFB5dGhvbiBzaWRlIGlzIGdvaW5nCi8vIHRvIHBvbGwgZm9yIGNvbXBsZXRpb24gb2YgZWFjaCBzdGVwLgovLyBUaGlzIHVzZXMgYSBQcm9taXNlIHRvIGJsb2NrIHRoZSBweXRob24gc2lkZSBvbiBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcCwKLy8gdGhlbiBwYXNzZXMgdGhlIHJlc3VsdCBvZiB0aGUgcHJldmlvdXMgc3RlcCBhcyB0aGUgaW5wdXQgdG8gdGhlIG5leHQgc3RlcC4KZnVuY3Rpb24gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpIHsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIGNvbnN0IHN0ZXBzID0gb3V0cHV0RWxlbWVudC5zdGVwczsKCiAgY29uc3QgbmV4dCA9IHN0ZXBzLm5leHQob3V0cHV0RWxlbWVudC5sYXN0UHJvbWlzZVZhbHVlKTsKICByZXR1cm4gUHJvbWlzZS5yZXNvbHZlKG5leHQudmFsdWUucHJvbWlzZSkudGhlbigodmFsdWUpID0+IHsKICAgIC8vIENhY2hlIHRoZSBsYXN0IHByb21pc2UgdmFsdWUgdG8gbWFrZSBpdCBhdmFpbGFibGUgdG8gdGhlIG5leHQKICAgIC8vIHN0ZXAgb2YgdGhlIGdlbmVyYXRvci4KICAgIG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSA9IHZhbHVlOwogICAgcmV0dXJuIG5leHQudmFsdWUucmVzcG9uc2U7CiAgfSk7Cn0KCi8qKgogKiBHZW5lcmF0b3IgZnVuY3Rpb24gd2hpY2ggaXMgY2FsbGVkIGJldHdlZW4gZWFjaCBhc3luYyBzdGVwIG9mIHRoZSB1cGxvYWQKICogcHJvY2Vzcy4KICogQHBhcmFtIHtzdHJpbmd9IGlucHV0SWQgRWxlbWVudCBJRCBvZiB0aGUgaW5wdXQgZmlsZSBwaWNrZXIgZWxlbWVudC4KICogQHBhcmFtIHtzdHJpbmd9IG91dHB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIG91dHB1dCBkaXNwbGF5LgogKiBAcmV0dXJuIHshSXRlcmFibGU8IU9iamVjdD59IEl0ZXJhYmxlIG9mIG5leHQgc3RlcHMuCiAqLwpmdW5jdGlvbiogdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKSB7CiAgY29uc3QgaW5wdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQoaW5wdXRJZCk7CiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gZmFsc2U7CgogIGNvbnN0IG91dHB1dEVsZW1lbnQgPSBkb2N1bWVudC5nZXRFbGVtZW50QnlJZChvdXRwdXRJZCk7CiAgb3V0cHV0RWxlbWVudC5pbm5lckhUTUwgPSAnJzsKCiAgY29uc3QgcGlja2VkUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBpbnB1dEVsZW1lbnQuYWRkRXZlbnRMaXN0ZW5lcignY2hhbmdlJywgKGUpID0+IHsKICAgICAgcmVzb2x2ZShlLnRhcmdldC5maWxlcyk7CiAgICB9KTsKICB9KTsKCiAgY29uc3QgY2FuY2VsID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnYnV0dG9uJyk7CiAgaW5wdXRFbGVtZW50LnBhcmVudEVsZW1lbnQuYXBwZW5kQ2hpbGQoY2FuY2VsKTsKICBjYW5jZWwudGV4dENvbnRlbnQgPSAnQ2FuY2VsIHVwbG9hZCc7CiAgY29uc3QgY2FuY2VsUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBjYW5jZWwub25jbGljayA9ICgpID0+IHsKICAgICAgcmVzb2x2ZShudWxsKTsKICAgIH07CiAgfSk7CgogIC8vIENhbmNlbCB1cGxvYWQgaWYgdXNlciBoYXNuJ3QgcGlja2VkIGFueXRoaW5nIGluIHRpbWVvdXQuCiAgY29uc3QgdGltZW91dFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgc2V0VGltZW91dCgoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9LCBGSUxFX0NIQU5HRV9USU1FT1VUX01TKTsKICB9KTsKCiAgLy8gV2FpdCBmb3IgdGhlIHVzZXIgdG8gcGljayB0aGUgZmlsZXMuCiAgY29uc3QgZmlsZXMgPSB5aWVsZCB7CiAgICBwcm9taXNlOiBQcm9taXNlLnJhY2UoW3BpY2tlZFByb21pc2UsIHRpbWVvdXRQcm9taXNlLCBjYW5jZWxQcm9taXNlXSksCiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdzdGFydGluZycsCiAgICB9CiAgfTsKCiAgaWYgKCFmaWxlcykgewogICAgcmV0dXJuIHsKICAgICAgcmVzcG9uc2U6IHsKICAgICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICAgIH0KICAgIH07CiAgfQoKICBjYW5jZWwucmVtb3ZlKCk7CgogIC8vIERpc2FibGUgdGhlIGlucHV0IGVsZW1lbnQgc2luY2UgZnVydGhlciBwaWNrcyBhcmUgbm90IGFsbG93ZWQuCiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gdHJ1ZTsKCiAgZm9yIChjb25zdCBmaWxlIG9mIGZpbGVzKSB7CiAgICBjb25zdCBsaSA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2xpJyk7CiAgICBsaS5hcHBlbmQoc3BhbihmaWxlLm5hbWUsIHtmb250V2VpZ2h0OiAnYm9sZCd9KSk7CiAgICBsaS5hcHBlbmQoc3BhbigKICAgICAgICBgKCR7ZmlsZS50eXBlIHx8ICduL2EnfSkgLSAke2ZpbGUuc2l6ZX0gYnl0ZXMsIGAgKwogICAgICAgIGBsYXN0IG1vZGlmaWVkOiAkewogICAgICAgICAgICBmaWxlLmxhc3RNb2RpZmllZERhdGUgPyBmaWxlLmxhc3RNb2RpZmllZERhdGUudG9Mb2NhbGVEYXRlU3RyaW5nKCkgOgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnbi9hJ30gLSBgKSk7CiAgICBjb25zdCBwZXJjZW50ID0gc3BhbignMCUgZG9uZScpOwogICAgbGkuYXBwZW5kQ2hpbGQocGVyY2VudCk7CgogICAgb3V0cHV0RWxlbWVudC5hcHBlbmRDaGlsZChsaSk7CgogICAgY29uc3QgZmlsZURhdGFQcm9taXNlID0gbmV3IFByb21pc2UoKHJlc29sdmUpID0+IHsKICAgICAgY29uc3QgcmVhZGVyID0gbmV3IEZpbGVSZWFkZXIoKTsKICAgICAgcmVhZGVyLm9ubG9hZCA9IChlKSA9PiB7CiAgICAgICAgcmVzb2x2ZShlLnRhcmdldC5yZXN1bHQpOwogICAgICB9OwogICAgICByZWFkZXIucmVhZEFzQXJyYXlCdWZmZXIoZmlsZSk7CiAgICB9KTsKICAgIC8vIFdhaXQgZm9yIHRoZSBkYXRhIHRvIGJlIHJlYWR5LgogICAgbGV0IGZpbGVEYXRhID0geWllbGQgewogICAgICBwcm9taXNlOiBmaWxlRGF0YVByb21pc2UsCiAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgYWN0aW9uOiAnY29udGludWUnLAogICAgICB9CiAgICB9OwoKICAgIC8vIFVzZSBhIGNodW5rZWQgc2VuZGluZyB0byBhdm9pZCBtZXNzYWdlIHNpemUgbGltaXRzLiBTZWUgYi82MjExNTY2MC4KICAgIGxldCBwb3NpdGlvbiA9IDA7CiAgICB3aGlsZSAocG9zaXRpb24gPCBmaWxlRGF0YS5ieXRlTGVuZ3RoKSB7CiAgICAgIGNvbnN0IGxlbmd0aCA9IE1hdGgubWluKGZpbGVEYXRhLmJ5dGVMZW5ndGggLSBwb3NpdGlvbiwgTUFYX1BBWUxPQURfU0laRSk7CiAgICAgIGNvbnN0IGNodW5rID0gbmV3IFVpbnQ4QXJyYXkoZmlsZURhdGEsIHBvc2l0aW9uLCBsZW5ndGgpOwogICAgICBwb3NpdGlvbiArPSBsZW5ndGg7CgogICAgICBjb25zdCBiYXNlNjQgPSBidG9hKFN0cmluZy5mcm9tQ2hhckNvZGUuYXBwbHkobnVsbCwgY2h1bmspKTsKICAgICAgeWllbGQgewogICAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgICBhY3Rpb246ICdhcHBlbmQnLAogICAgICAgICAgZmlsZTogZmlsZS5uYW1lLAogICAgICAgICAgZGF0YTogYmFzZTY0LAogICAgICAgIH0sCiAgICAgIH07CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPQogICAgICAgICAgYCR7TWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCl9JSBkb25lYDsKICAgIH0KICB9CgogIC8vIEFsbCBkb25lLgogIHlpZWxkIHsKICAgIHJlc3BvbnNlOiB7CiAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgIH0KICB9Owp9CgpzY29wZS5nb29nbGUgPSBzY29wZS5nb29nbGUgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYiA9IHNjb3BlLmdvb2dsZS5jb2xhYiB8fCB7fTsKc2NvcGUuZ29vZ2xlLmNvbGFiLl9maWxlcyA9IHsKICBfdXBsb2FkRmlsZXMsCiAgX3VwbG9hZEZpbGVzQ29udGludWUsCn07Cn0pKHNlbGYpOwo=",
              "ok": true,
              "headers": [
                [
                  "content-type",
                  "application/javascript"
                ]
              ],
              "status": 200,
              "status_text": ""
            }
          },
          "base_uri": "https://localhost:8080/",
          "height": 75
        }
      },
      "source": [
        "#Load the data\n",
        "from google.colab import files # Use to load data on Google Colab\n",
        "uploaded = files.upload() # Use to load data on Google Colab\n"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/html": [
              "\n",
              "     <input type=\"file\" id=\"files-4b45367c-899f-47b2-8615-a9837a10bc1a\" name=\"files[]\" multiple disabled />\n",
              "     <output id=\"result-4b45367c-899f-47b2-8615-a9837a10bc1a\">\n",
              "      Upload widget is only available when the cell has been executed in the\n",
              "      current browser session. Please rerun this cell to enable.\n",
              "      </output>\n",
              "      <script src=\"/nbextensions/google.colab/files.js\"></script> "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "Saving emails.csv to emails (1).csv\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "L2k5IZlCeCsf",
        "colab_type": "code",
        "outputId": "1893e173-1d98-4354-e87e-1bc20475c503",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 202
        }
      },
      "source": [
        "df = pd.read_csv('emails.csv') #read the CSV file\n",
        "df.head(5)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>text</th>\n",
              "      <th>spam</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Subject: naturally irresistible your corporate...</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Subject: the stock trading gunslinger  fanny i...</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Subject: unbelievable new homes made easy  im ...</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Subject: 4 color printing special  request add...</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Subject: do not have money , get software cds ...</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                text  spam\n",
              "0  Subject: naturally irresistible your corporate...     1\n",
              "1  Subject: the stock trading gunslinger  fanny i...     1\n",
              "2  Subject: unbelievable new homes made easy  im ...     1\n",
              "3  Subject: 4 color printing special  request add...     1\n",
              "4  Subject: do not have money , get software cds ...     1"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 4
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0D7IQYoEjedi",
        "colab_type": "code",
        "outputId": "88817564-26fd-4a0d-8bc2-a75f0453f1a3",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "#Print the shape (Get the number of rows and cols)\n",
        "df.shape"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(5728, 2)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 51
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "eKyhDtN-jkX_",
        "colab_type": "code",
        "outputId": "9371ed55-670d-4934-e5eb-fe6d194aefbd",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "#Get the column names\n",
        "df.columns"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "Index(['text', 'spam'], dtype='object')"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 52
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xr511l-ajqdX",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "#Checking for duplicates and removing them\n",
        "df.drop_duplicates(inplace = True)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XInJwy4PkdMQ",
        "colab_type": "code",
        "outputId": "156b9b79-7a3d-4355-81cf-2d32b7c7c427",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "#Show the new shape (number of rows & columns)\n",
        "df.shape"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(5695, 2)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 54
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "o2eNWsvEjrJm",
        "colab_type": "code",
        "outputId": "70497a76-3364-4285-9c28-7896ee5c2752",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 70
        }
      },
      "source": [
        "#Show the number of missing (NAN, NaN, na) data for each column\n",
        "df.isnull().sum()"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "text    0\n",
              "spam    0\n",
              "dtype: int64"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 55
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wPudb_qxHiCW",
        "colab_type": "code",
        "outputId": "0b818f71-243e-4c62-9a1f-9b8f3be5969d",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 69
        }
      },
      "source": [
        "#Need to download stopwords\n",
        "nltk.download('stopwords')"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Unzipping corpora/stopwords.zip.\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nOT_3dp2ulx8",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "#Tokenization (a list of tokens), will be used as the analyzer\n",
        "#1.Punctuations are [!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~]\n",
        "#2.Stop words in natural language processing, are useless words (data).\n",
        "def process_text(text):\n",
        "    '''\n",
        "    What will be covered:\n",
        "    1. Remove punctuation\n",
        "    2. Remove stopwords\n",
        "    3. Return list of clean text words\n",
        "    '''\n",
        "    \n",
        "    #1\n",
        "    nopunc = [char for char in text if char not in string.punctuation]\n",
        "    nopunc = ''.join(nopunc)\n",
        "    \n",
        "    #2\n",
        "    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]\n",
        "    \n",
        "    #3\n",
        "    return clean_words"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "g9ls1rIOuxYY",
        "colab_type": "code",
        "outputId": "5c77db00-a954-471b-8f78-4a162ec680e8",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 121
        }
      },
      "source": [
        "#Show the Tokenization (a list of tokens )\n",
        "df['text'].head().apply(process_text)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0    [Subject, naturally, irresistible, corporate, ...\n",
              "1    [Subject, stock, trading, gunslinger, fanny, m...\n",
              "2    [Subject, unbelievable, new, homes, made, easy...\n",
              "3    [Subject, 4, color, printing, special, request...\n",
              "4    [Subject, money, get, software, cds, software,...\n",
              "Name: text, dtype: object"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "lFnqyCKGD_LX",
        "colab_type": "code",
        "outputId": "f1a31c5a-4198-4c17-c4d3-dbb56c5e601d",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 173
        }
      },
      "source": [
        "'''\n",
        "EXAMPLE OF THE PROCESS TO PREPARE THE DATA FOR TRAINING ON THE CLASSIFIER, \n",
        "THIS CELL/BLOCK ISNT NECESSARY TO RUN THE PROGRAM\n",
        "'''\n",
        "#Print the text (aka the email message)\n",
        "message4 = 'hello world hello hello world play' #df['text'][3]\n",
        "message5 = 'test test test test one hello'\n",
        "print(message4)\n",
        "print()\n",
        "\n",
        "#Convert a collection of text documents to a matrix of token counts\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "bow4 =  CountVectorizer(analyzer=process_text).fit_transform([[message4], [message5]])\n",
        "print(bow4)\n",
        "print()\n",
        "\n",
        "#Transform a count matrix to a normalized tf or tf-idf representation\n",
        "#Show the weight of TF-IDF\n",
        "#Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency. \n",
        "#from sklearn.feature_extraction.text import TfidfTransformer\n",
        "#tfidf4 = TfidfTransformer().fit_transform(bow4)\n",
        "#print(tfidf4)\n",
        "#print()\n",
        "\n",
        "#Print the shape (number of rows & columns) of bow4 \n",
        "#print(bow4.shape)\n",
        "#print()\n",
        "\n",
        "#Show bow4 row at index 1\n",
        "#bow4[1]"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "hello world hello hello world play\n",
            "\n",
            "  (0, 0)\t3\n",
            "  (0, 4)\t2\n",
            "  (0, 2)\t1\n",
            "  (1, 0)\t1\n",
            "  (1, 3)\t4\n",
            "  (1, 1)\t1\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "RJqHT1CbqLuI",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Convert a collection of text documents to a matrix of token counts\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "\n",
        "## Converts strings to integer counts\n",
        "#vectorizer = CountVectorizer(analyzer=process_text)\n",
        "\n",
        "## Learn a vocabulary dictionary of all tokens in the raw documents.\n",
        "#bow_transformer = vectorizer.fit(df['text'])    \n",
        "\n",
        "## Transform documents to document-term matrix.\n",
        "#messages_bow = bow_transformer.transform(df['text'])\n",
        "\n",
        "#Convert string to integer counts, learn the vocabulary dictionary and return term-document matrix\n",
        "messages_bow = CountVectorizer(analyzer=process_text).fit_transform(df['text'])\n",
        "\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "w8nJlRb1z-Bi",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Transform a count matrix to a normalized tf or tf-idf representation\n",
        "# Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency. \n",
        "#from sklearn.feature_extraction.text import TfidfTransformer\n",
        "\n",
        "## Learn the idf vector (global term weights)\n",
        "#tfidf_transformer=TfidfTransformer().fit(messages_bow)\n",
        "\n",
        "## Transform a count matrix to a tf or tf-idf representation\n",
        "#messages_tfidf = tfidf_transformer.transform(messages_bow)\n",
        "\n",
        "#Learn the idf vector to fit to data, then transform it.\n",
        "#messages_tfidf = TfidfTransformer().fit_transform(messages_bow)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "iZSWZRaTdKCq",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "#Split the data into 80% training (X_train & y_train) and 20% testing (X_test & y_test) data sets\n",
        "from sklearn.model_selection import train_test_split\n",
        "X_train, X_test, y_train, y_test = train_test_split(messages_bow, df['spam'], test_size = 0.20, random_state = 0)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "SkNZ1TNUoq8e",
        "colab_type": "code",
        "outputId": "78c27c42-764b-43c7-e7d3-16a7521b9ae7",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "#Get the shape of messages_bow\n",
        "messages_bow.shape"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(5695, 37229)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 40
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0Pm4NudwNoS2",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "#Get the shape of messages_tfidf\n",
        "#messages_tfidf.shape"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "SQhHPp1nqhio",
        "colab_type": "code",
        "outputId": "de9068e3-cb4c-437f-e96f-b17998bffa2f",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "#Create and train the Naive Bayes classifier\n",
        "#The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification)\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "classifier = MultinomialNB()\n",
        "classifier.fit(X_train, y_train)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 26
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vlunNgqM6PUj",
        "colab_type": "code",
        "outputId": "c2b5a434-52d1-4544-f6f0-196ad53fee80",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 52
        }
      },
      "source": [
        "#Print the predictions\n",
        "print(classifier.predict(X_train))\n",
        "\n",
        "#Print the actual values\n",
        "print(y_train.values)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[0 0 0 ... 0 0 0]\n",
            "[0 0 0 ... 0 0 0]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "P2hchOco7Nqj",
        "colab_type": "code",
        "outputId": "aa6574af-a626-4cb9-8195-89d2ec33d620",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 260
        }
      },
      "source": [
        "#Evaluate the model on the training data set\n",
        "from sklearn.metrics import classification_report,confusion_matrix, accuracy_score\n",
        "pred = classifier.predict(X_train)\n",
        "print(classification_report(y_train ,pred ))\n",
        "print('Confusion Matrix: \\n',confusion_matrix(y_train,pred))\n",
        "print()\n",
        "print('Accuracy: ', accuracy_score(y_train,pred))"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       1.00      1.00      1.00      3457\n",
            "           1       0.99      1.00      0.99      1099\n",
            "\n",
            "    accuracy                           1.00      4556\n",
            "   macro avg       0.99      1.00      1.00      4556\n",
            "weighted avg       1.00      1.00      1.00      4556\n",
            "\n",
            "Confusion Matrix: \n",
            " [[3445   12]\n",
            " [   1 1098]]\n",
            "\n",
            "Accuracy:  0.9971466198419666\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xSat9N4sjc_r",
        "colab_type": "code",
        "outputId": "3d0074d1-0b68-47c6-e4d4-0010a99dcd81",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 52
        }
      },
      "source": [
        "#Print the predictions\n",
        "print('Predicted value: ',classifier.predict(X_test))\n",
        "\n",
        "#Print Actual Label\n",
        "print('Actual value: ',y_test.values)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Predicted value:  [1 0 0 ... 0 0 0]\n",
            "Actual value:  [1 0 0 ... 0 0 0]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "d5WM2akdjoeM",
        "colab_type": "code",
        "outputId": "5b59cc3d-4528-45fc-bf0a-e47228d28c50",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 260
        }
      },
      "source": [
        "#Evaluate the model on the test data set\n",
        "from sklearn.metrics import classification_report,confusion_matrix, accuracy_score\n",
        "pred = classifier.predict(X_test)\n",
        "print(classification_report(y_test ,pred ))\n",
        "\n",
        "print('Confusion Matrix: \\n', confusion_matrix(y_test,pred))\n",
        "print()\n",
        "print('Accuracy: ', accuracy_score(y_test,pred))"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       1.00      0.99      0.99       870\n",
            "           1       0.97      1.00      0.98       269\n",
            "\n",
            "    accuracy                           0.99      1139\n",
            "   macro avg       0.98      0.99      0.99      1139\n",
            "weighted avg       0.99      0.99      0.99      1139\n",
            "\n",
            "Confusion Matrix: \n",
            " [[862   8]\n",
            " [  1 268]]\n",
            "\n",
            "Accuracy:  0.9920983318700615\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}
